#if(!require(installr)) {
#install.packages(installr); require(installr)} #load / install+load installr
# using the package:
#updateR()
#update.packages(ask = FALSE)
#install.packages("lifecycle")
#install.packages(tidymodels)
#install.packages("rsample")
library(readr)
library(readxl)
library(tidyr)
library(tidymodels)
library(dplyr)
library(tidyverse)
library(fastDummies)
library(reshape2)
library(ggplot2)
library(caTools)
library(relaimpo)
library(MASS)
player_df <- read_excel("../dataset-ignore/19-20 palyer total (1).xlsx")
head(player_df)
## # A tibble: 6 × 31
## Rk Player Pos Age Tm G GS MP FG FGA `FG%` `3P` `3PA`
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Steve… C 26 OKC 63 63 1680 283 478 0.592 1 3
## 2 2 Bam A… PF 22 MIA 72 72 2417 440 790 0.557 2 14
## 3 3 LaMar… C 34 SAS 53 53 1754 391 793 0.493 61 157
## 4 4 Kyle … C 23 MIA 2 0 13 1 2 0.5 0 0
## 5 5 Nicke… SG 21 NOP 47 1 591 98 266 0.368 46 133
## 6 6 Grays… SG 24 MEM 38 0 718 117 251 0.466 57 141
## # … with 18 more variables: `3P%` <dbl>, `2P` <dbl>, `2PA` <dbl>, `2P%` <dbl>,
## # `eFG%` <dbl>, FT <dbl>, FTA <dbl>, `FT%` <dbl>, ORB <dbl>, DRB <dbl>,
## # TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>, TOV <dbl>, PF <dbl>, PTS <dbl>,
## # `Player-additional` <chr>
play_num = subset(player_df, select = -c(Player, Rk,Tm,`Player-additional`,GS,FG, FGA,`2P`,`3PA`,`2PA`,`2P%`, `3P%`, `eFG%`, `FG%`,FT, FTA,TRB, PTS))
play_num <- drop_na(play_num)
length(play_num)
## [1] 13
dummy <- dummy_cols(play_num, select_columns = c("Pos"), remove_first_dummy = TRUE)
model_data <- subset(dummy, select = -c(Pos))
num_data <- play_num %>% dplyr::select(where(is.numeric))
summary(num_data)
## Age G MP 3P
## Min. :19.00 Min. : 1.00 Min. : 4.0 Min. : 0.00
## 1st Qu.:23.00 1st Qu.:19.00 1st Qu.: 270.5 1st Qu.: 4.25
## Median :25.00 Median :43.00 Median : 806.5 Median : 28.00
## Mean :25.83 Mean :40.03 Mean : 908.8 Mean : 46.07
## 3rd Qu.:28.00 3rd Qu.:60.00 3rd Qu.:1477.2 3rd Qu.: 70.00
## Max. :43.00 Max. :74.00 Max. :2556.0 Max. :299.00
## FT% ORB DRB AST
## Min. :0.0000 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.:0.6670 1st Qu.: 9.00 1st Qu.: 37.0 1st Qu.: 18.0
## Median :0.7705 Median : 24.00 Median : 96.0 Median : 52.5
## Mean :0.7438 Mean : 37.86 Mean :130.8 Mean : 90.2
## 3rd Qu.:0.8360 3rd Qu.: 52.00 3rd Qu.:192.8 3rd Qu.:115.0
## Max. :1.0000 Max. :258.00 Max. :716.0 Max. :684.0
## STL BLK TOV PF
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 7.25 1st Qu.: 4.00 1st Qu.: 14.00 1st Qu.: 28.25
## Median : 23.00 Median : 10.00 Median : 37.00 Median : 74.00
## Mean : 28.93 Mean : 18.59 Mean : 52.12 Mean : 78.58
## 3rd Qu.: 45.00 3rd Qu.: 24.00 3rd Qu.: 74.75 3rd Qu.:122.00
## Max. :125.00 Max. :196.00 Max. :308.00 Max. :278.00
Table 1
The descriptive statistic output shows the mean age of NBA players is approximately 26 years (25.83). The oldest player is 43 years and the youngest is 19 resulting in a range of about 25 years. The maximum number of games a player has started (G) is 74 and the minimum is 1, and this variation could be related to player performance whereby a high-performing player is prioritized. The same logic applies to minutes played per game (MP). Our target variable is 3P, which is the number of 3-point field goals per game. The mean 3-P score is 46.07, the minimum is 0 and the maximum is 299 per game. The 75th quartile is 201, implying that at least 25% of all NBA players score make about 70 3P scores per game. This cohort may represent elite players with consistently good performance, hence, sports organizations should focus on not to lose them to rival teams The histogram below illustrates the distribution of 3P scoring. It is evident that distribution of 3P scores is skewed to the right, as evidenced by most values that lie on the left side of the chart. This means that most NBA players score fewer 3P field goals. However, we can discern the presence of outliers – a bar far away from the rest of other bars. This outlier represents NBA players with exceptionally high 3P scores compared to the general population of NBA players. We are more interested in the general population, and hence, it was worthwhile to remove it to avoid skewing the performance of our model.
hist(num_data$`3P`, main = " Distribution of 3P", xlab = 'bins')
Figure 1 Distribution of 3P Scores
boxplot(play_num$`3P`~ play_num$Pos,
main = ' A Boxplot of 3 Point Scoring by Player Position',
ylab = '3 Point Scoring (3P)', xlab = 'Player Position', col = 'yellow', border = 'brown')
Figure 2 Side-to-Side Boxplot of 3 Point Scores by Position
The boxplot above captures the distribution of 3-point scores by player position. Based on the plot, we can discern a significant difference in 3P scores across player positions, with SF-PF registering the highest mean 3P scores, followed at distance by SF-FG, and SG-PG. This finding supports the inclusion of player position as a predictor of the 3P score and checks whether the existing relationship is statistically significant.
cormatrx <- round(cor(num_data),3)
cormat_melt <- melt(cormatrx)
corr_heatmap <- ggplot(data = cormat_melt, aes(x = Var1, y=Var2, label = value, fill = value))+
geom_tile()
corr_heatmap+
geom_text(aes(Var1, Var2, label = value), color = "black", size = 4)
set.seed(1)
sample <- sample.split(model_data$`3P`, SplitRatio = 0.7)
train <- subset(model_data, sample == TRUE)
test <- subset(model_data, sample == FALSE)
Figure 3 Correlation Plot
The correlation plot above identifies the direction and magnitude of association between pairs of variables in our dataset. It is evident that there is a moderate positive relationship between 3P scores and games played (G, r = 0.644), 3P scores and assists per game (AST, r = 0.613), 3P scores and turnover per game (TOV, r = 0.667), and a strong positive relationship between 3P scores and games played (G, r=0.773). We can also identify the presence of multicollinearity issues in our dataset, as evidenced by the high correlation between pairs of independent variables. For example, there is a high correlation between minutes played (MP) and personal fouls per game (PF, r = 0.897), turnover per game (TOV) and MP (r = 0.853), assist per game (AST) and turnover per game (TOV, r = 0.897). A possible approach to address this issue is to remove one of the variables from each pair. We included a cut-off of 0.85 in our regression model funciton to eliminate one of the independent variables that are highly correlated, preferably one with a higher p-value.
It is worthwhile to note that issues such as missing values and outliers were addressed by removing rows containing NA and extreme values. Our next step is to perform data transformation. Firstly, we performed dummy encoding on our only categorical data, player position, using the dummy_cols () function and removed the first dummy column to avoid the multicollinearity issue. Secondly, variables, which contain
large values, such as Age, STL (steals per game), MP (minutes played), and BLK (blocks per game) were normalized. Most importantly, all these transformations, including splitting data into training sets were done at once using the recipe function in R and the results were also mirrored to the testing data using the bake function.
rec <- recipe(
`3P` ~.,
data = train)%>%
step_normalize(Age, STL, MP, BLK)%>%
step_corr(all_numeric_predictors(), threshold = 0.85)
log.rec.prep <- prep(rec, training = train)
train_set <- log.rec.prep %>%
bake(new_data = NULL)
l.model = lm(`3P`~., data = train_set)
summary(l.model)
##
## Call:
## lm(formula = `3P` ~ ., data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -148.376 -14.064 -1.441 11.510 145.102
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -29.77606 9.13458 -3.260 0.001206 **
## Age 1.65163 1.48740 1.110 0.267453
## G 0.97650 0.11234 8.692 < 2e-16 ***
## `FT%` 37.40258 10.65237 3.511 0.000494 ***
## ORB -0.76994 0.07675 -10.032 < 2e-16 ***
## DRB 0.19292 0.03176 6.074 2.79e-09 ***
## AST 0.11406 0.02531 4.507 8.52e-06 ***
## STL 6.15142 2.84676 2.161 0.031271 *
## BLK 1.48954 2.49903 0.596 0.551464
## `Pos_C-PF` 16.42202 30.75384 0.534 0.593635
## Pos_PF 3.89151 4.88875 0.796 0.426473
## `Pos_PF-C` 6.72209 18.24347 0.368 0.712711
## `Pos_PF-SF` -17.63716 21.96377 -0.803 0.422421
## Pos_PG -10.07369 6.45317 -1.561 0.119264
## `Pos_PG-SG` -4.40865 30.77196 -0.143 0.886147
## Pos_SF 4.08671 5.53191 0.739 0.460469
## `Pos_SF-C` NA NA NA NA
## `Pos_SF-PF` 14.96257 21.99959 0.680 0.496796
## `Pos_SF-SG` 9.59393 30.96106 0.310 0.756813
## Pos_SG 11.67706 5.52524 2.113 0.035154 *
## `Pos_SG-PG` 20.92139 30.75211 0.680 0.496674
## `Pos_SG-SF` -1.45232 30.73982 -0.047 0.962340
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.41 on 421 degrees of freedom
## Multiple R-squared: 0.6852, Adjusted R-squared: 0.6703
## F-statistic: 45.83 on 20 and 421 DF, p-value: < 2.2e-16
Figure 4 Final Model
While all 25 predictor variables namely, age, G, MP, STL, FT%, ORB, AST, BLK, TOV, PF, and POS (14 dummy variables) were in the model, the StepAIC selected only 8 variables to statistically significant at 0.05 and with the lowest AIC value. The selected variables include G, FT%, ORB, AST, Pos_PG and Pos_SG. The coefficient on G is 0.9795, implying that a unit increase in the number of games played increases 3 points by at least 0.97, approximately 1, presuming all other factors are held constant. The coefficient on Pos_SG (SG = 1, not SG = 0) is 7.993, implying that NBA players in SG (shooting guard) position score 8 3-P scores higher than all other positions, when other factors are also held constant. In contrast, NBA players in PG positions score 13.60 (approximately 14) less 3P scores
compared to players in other positions. The global F test informs us to reject the null hypothesis all coefficients are equal to zero because there is sufficient evidence to indicate that at least one covariate is statistically different from zero, F (8, 433) = 116.2, p-value = 2.2e-16 <0.05. Lastly, the adjusted R- squared is 0.6763, implying that this final model explains at least 67.63% of the variation in 3-P scores across NBA players. The resulting RMSE from prediction on test data is 30.583.
testing_set <- bake(log.rec.prep, test)
test[1:10, names(testing_set)]
## # A tibble: 10 × 22
## Age G `FT%` ORB DRB AST STL BLK `Pos_C-PF` Pos_PF `Pos_PF-C`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 27 10 0.636 2 7 21 5 2 0 0 0
## 2 29 18 0.655 24 63 21 18 8 0 1 0
## 3 26 10 0.5 1 20 8 0 6 0 0 0
## 4 22 5 0.5 2 1 2 0 0 0 1 0
## 5 22 69 0.706 80 284 108 96 45 0 0 0
## 6 34 53 0.838 34 212 91 69 15 0 0 0
## 7 34 32 0.778 21 125 50 35 7 0 0 0
## 8 19 56 0.614 50 229 143 55 17 0 0 0
## 9 29 58 0.767 76 289 212 61 29 0 0 0
## 10 31 22 0.9 25 75 66 17 8 0 0 0
## # … with 11 more variables: `Pos_PF-SF` <int>, Pos_PG <int>, `Pos_PG-SG` <int>,
## # Pos_SF <int>, `Pos_SF-C` <int>, `Pos_SF-PF` <int>, `Pos_SF-SG` <int>,
## # Pos_SG <int>, `Pos_SG-PG` <int>, `Pos_SG-SF` <int>, `3P` <dbl>
fitted = predict(l.model, newdata = testing_set)
## Warning in predict.lm(l.model, newdata = testing_set): prediction from a rank-
## deficient fit may be misleading
actuals = as.numeric(testing_set$`3P`)
sqrt(mean((actuals - fitted)^2))
## [1] 30.18264
l.model.fin <- lm(`3P` ~., data = train_set) %>%
stepAIC(trace = FALSE)
summary(l.model.fin)
##
## Call:
## lm(formula = `3P` ~ G + `FT%` + ORB + DRB + AST + STL + Pos_PG +
## Pos_SG, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -147.628 -13.627 -1.535 11.659 144.382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -28.42290 8.41698 -3.377 0.000799 ***
## G 0.97954 0.11033 8.878 < 2e-16 ***
## `FT%` 38.85695 10.36809 3.748 0.000203 ***
## ORB -0.78189 0.06655 -11.749 < 2e-16 ***
## DRB 0.20192 0.03053 6.614 1.11e-10 ***
## AST 0.11160 0.02425 4.603 5.49e-06 ***
## STL 6.72707 2.67047 2.519 0.012126 *
## Pos_PG -13.60417 5.12053 -2.657 0.008180 **
## Pos_SG 7.99634 4.00340 1.997 0.046408 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.13 on 433 degrees of freedom
## Multiple R-squared: 0.6822, Adjusted R-squared: 0.6763
## F-statistic: 116.2 on 8 and 433 DF, p-value: < 2.2e-16
testing_set <- bake(log.rec.prep, test)
test[1:10, names(testing_set)]
## # A tibble: 10 × 22
## Age G `FT%` ORB DRB AST STL BLK `Pos_C-PF` Pos_PF `Pos_PF-C`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 27 10 0.636 2 7 21 5 2 0 0 0
## 2 29 18 0.655 24 63 21 18 8 0 1 0
## 3 26 10 0.5 1 20 8 0 6 0 0 0
## 4 22 5 0.5 2 1 2 0 0 0 1 0
## 5 22 69 0.706 80 284 108 96 45 0 0 0
## 6 34 53 0.838 34 212 91 69 15 0 0 0
## 7 34 32 0.778 21 125 50 35 7 0 0 0
## 8 19 56 0.614 50 229 143 55 17 0 0 0
## 9 29 58 0.767 76 289 212 61 29 0 0 0
## 10 31 22 0.9 25 75 66 17 8 0 0 0
## # … with 11 more variables: `Pos_PF-SF` <int>, Pos_PG <int>, `Pos_PG-SG` <int>,
## # Pos_SF <int>, `Pos_SF-C` <int>, `Pos_SF-PF` <int>, `Pos_SF-SG` <int>,
## # Pos_SG <int>, `Pos_SG-PG` <int>, `Pos_SG-SF` <int>, `3P` <dbl>
fitted = predict(l.model.fin, newdata = testing_set)
actuals = as.numeric(testing_set$`3P`)
sqrt(mean((actuals - fitted)^2))
## [1] 30.58373
plot(fitted,
xlab = "Predicted Values",
ylab = "Observed Values")+
geom_point()+
abline(a = 0,
b = 1,
col = "red",
lwd = 2)
## integer(0)
rel_imp <- calc.relimp(l.model.fin, type = c("lmg"), rela = TRUE)
boo_rslts <- boot.relimp(l.model.fin, b=1000)
ci <- booteval.relimp(boo_rslts, norank = T)
plot(ci)
Figure 5 Relative Importance
Based on the bar chart, games played (G) are ranked as the important predictor of 3-P scores, followed closely by AST (number of assists), then STL (steals per game), (DRB) defensive, and (ORB) offensive rebounds. Scouts should pay more attention to these characteristics when scouting for NBA players to play in the professional league as it will guarantee positive returns on their investments.
Analyis Part 2
#load data into R
nba_data <- read_csv("../dataset-ignore/NBA Stats.csv")
head(nba_data) #glimpse of the data
## # A tibble: 6 × 34
## ...1 Player Ratings Team Age Height Weight College Country Draft…¹ Draft…²
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 0 Kent … 76 ATL 29 6-5 201 Old Domini… USA Undraf…
## 2 1 Deway… 77 ATL 29 7-0 245 Southe… Califo… USA Undraf…
## 3 2 John … 84 ATL 21 6-10 235 Wake Forest USA 2017
## 4 3 Trae … 84 ATL 20 6-2 180 Oklaho… USA 2018 1
## 5 4 Kevin… 74 ATL 20 6-7 190 Maryla… USA 2018 1
## 6 5 Vince… 73 ATL 42 6-6 220 North Caroli… USA 1998
## # … with 23 more variables: `Draft Number` <chr>, Shoes <chr>, `2018/19` <chr>,
## # gp <dbl>, min <dbl>, pts <dbl>, fgm <dbl>, fga <dbl>, `fg%` <dbl>,
## # `3pm` <dbl>, `3pa` <dbl>, `3p%` <dbl>, ftm <dbl>, fta <dbl>, `ft%` <dbl>,
## # oreb <dbl>, dreb <dbl>, reb <dbl>, ast <dbl>, stl <dbl>, blk <dbl>,
## # tov <dbl>, eff <dbl>, and abbreviated variable names ¹`Draft Year`,
## # ²`Draft Round`
#Data Preprocessing and Preparation
#filter to select players who made at least 1.5 3s per game
nba_data <- nba_data %>%
filter(`3pm` >= 1.5)
nba_data$Salary = nba_data$`2018/19`
#remove the $ sign
nba_data$Salary = as.numeric(gsub("[\\$,]", "", nba_data$Salary ))
#convert weight to kgs
nba_data$Weight = nba_data$Weight* 0.453592
#player height is in character form, convert to height in feet
#create a function to do so:
playerHeight <- function(x) {
x1 <- as.numeric(sub("-.*", "", x))
x2 <- as.numeric(sub(".*-", "", x))
(x1 * 12) + x2
}
#apply the function to the height data values
nba_data$Height <- playerHeight(nba_data$Height)
#select relevant variables into 1 data frame
nba_df = subset(nba_data, select = c(Age,gp,min,ast,oreb,dreb,fta, stl,Shoes,blk,tov,Salary,Height, Weight,`3pm`))
#Exploratory Data Analysis (EDA)
#descriptive stats
summary(nba_df)
## Age gp min ast
## Min. :19.00 Min. :58.00 Min. :17.20 Min. : 0.500
## 1st Qu.:24.00 1st Qu.:69.00 1st Qu.:27.30 1st Qu.: 1.800
## Median :27.00 Median :76.00 Median :30.90 Median : 2.800
## Mean :27.04 Mean :74.43 Mean :29.66 Mean : 3.464
## 3rd Qu.:30.00 3rd Qu.:80.00 3rd Qu.:33.70 3rd Qu.: 4.600
## Max. :42.00 Max. :82.00 Max. :36.90 Max. :10.700
## oreb dreb fta stl
## Min. :0.1000 Min. :1.30 Min. : 0.600 Min. :0.4000
## 1st Qu.:0.4000 1st Qu.:2.50 1st Qu.: 1.400 1st Qu.:0.6000
## Median :0.7000 Median :3.50 Median : 2.400 Median :0.9000
## Mean :0.7358 Mean :3.69 Mean : 3.105 Mean :0.9383
## 3rd Qu.:0.9000 3rd Qu.:4.30 3rd Qu.: 4.000 3rd Qu.:1.1000
## Max. :3.4000 Max. :9.60 Max. :11.000 Max. :2.2000
## Shoes blk tov Salary
## Length:81 Min. :0.000 Min. :0.300 Min. : 1378242
## Class :character 1st Qu.:0.200 1st Qu.:1.000 1st Qu.: 3447480
## Mode :character Median :0.300 Median :1.500 Median : 9367200
## Mean :0.384 Mean :1.802 Mean :12429394
## 3rd Qu.:0.400 3rd Qu.:2.300 3rd Qu.:19360228
## Max. :2.200 Max. :5.000 Max. :37457154
## Height Weight 3pm
## Min. :72.00 Min. : 79.38 Min. :1.50
## 1st Qu.:76.00 1st Qu.: 90.72 1st Qu.:1.70
## Median :78.00 Median : 95.25 Median :1.90
## Mean :77.77 Mean : 95.23 Mean :2.11
## 3rd Qu.:80.00 3rd Qu.: 99.79 3rd Qu.:2.30
## Max. :84.00 Max. :122.47 Max. :5.10
The descriptive statistic output shows the mean age of NBA players is approximately 27 years (27.04). The oldest player is 42 years and the youngest is 19 resulting in a range of about 23 years. The mean height is 77.77 inches (approximately 6.41 ft). The shortest NBA player is 6ft tall (72 inches) while the tallest is 7ft tall (84 inches), translating to a range of 12 inches. With regard to player performance, the maximum number of games a played is (gp) is 82 and the minimum is 58, and this variation could be related to player performance whereby a high-performing player is highly prioritized in terms of games allocation. The same logic applies to minutes played (MP). Our target variable is 3Pm, which is the number of 3-point scores made per game. The mean 3-P score is 2.11, the minimum is 1.5 and the maximum 3 point scores ever made by one player in one game is 5.11. The 75th quartile is 2.30, implying that at least 25% of all NBA players made above 2.3 scores per game. This cohort may represent elite players with consistently good performance, hence, sports organizations should focus on not to lose them to rival teams.
#histogram plots
ggplot(nba_df, aes(x =`3pm`))+
geom_histogram(color = 'darkblue', fill ='lightblue')+
ggtitle('Distribution of 3PM')+
theme(plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram above illustrates the distribution of 3-p scores made (denoted as 3pm). It is evident that distribution of 3P scores is skewed to the right, as evidenced by most values that lie on the left side of the chart. This means that most NBA players score fewer 3P field goals. However, we can discern the presence of outliers – a bar far away from the rest of other bars. This outlier represents NBA players with exceptionally high 3P scores compared to the general population of NBA players. We are more interested in the general population, and hence, it was worthwhile to remove the outliers to avoid skewing the performance of our model.
#box plot of 3pm by shoes
ggplot(nba_data, aes(x = Shoes, y=`3pm`, fill=Shoes))+
geom_boxplot(outlier.colour = 'red', notch = FALSE)+
ggtitle('Box Plot of 3-Point Scores Made by Shoes Worn')+
theme(plot.title = element_text(hjust = 0.5))
Figure 2
Figure 2 is a side-to-side box plot of the dependent variable, 3pm, by shoe brand. There appears to be an association between shoe and player scoring, as evidenced by the variation in mean 3P scores. Players using under armour made the highest 3p scores, followed at a distant by Jordan, Adidas, Nike, Puma, and so on. This statistical finding supports the inclusion of shoe brand as a predictor of the 3P score and checks whether the existing relationship is statistically significant.
col_df <- nba_data %>% group_by(College)%>% summarize(Mean_3Pm= mean(`3pm`))
con_df <-nba_data %>% group_by(Country)%>% summarize(Mean_3Pm= mean(`3pm`))
plot_ly(col_df, x = ~College, y = ~Mean_3Pm, type = 'bar', text = text)
plot_ly(con_df, x = ~Country, y = ~Mean_3Pm, type = 'bar', text = text)
#correlation analysis
cor_df <- subset(nba_df, select =-c(Shoes))
cor.mat <- round(cor(cor_df),3) #correlation matrix
cormat_melt <- melt(cor.mat)
corr_heatmap <- ggplot(data = cormat_melt, aes(x = Var1, y=Var2, label = value, fill = value))+
geom_tile()
corr_heatmap+
geom_text(aes(Var1, Var2, label = value), color = "black", size = 4)
Figure 5 Correlation Matrix
The correlation plot above identifies the direction and magnitude of association between pairs of variables in our dataset. It is evident that there is a moderate positive relationship between 3P scores and assists (ast, r = 0.262), 3p scores and minutes played (min, r = 0.41), 3p scores and steals (stl, r =0.349), and 3p scores and age (r=0.221). It is interesting that correlation between player height and 3p scores is negative (r = -0.088). However, further analysis indicate that the correlation coefficient is not statistically different from zero, hence, height and 3p scores are unrelated. We can also identify the presence of multicollinearity issues in our dataset, as evidenced by the high correlation between some of the pairs of independent variables. For example, there is a high correlation between assists (ast) and turnover (tov, r = 0.858). A possible approach to address this issue is to remove one of the variables from each pair. We included a cut-off of 0.85 in our regression model function to eliminate one of the independent variables that is highly correlated, preferably one with a higher p-value.
#Interactive Charts
fig <- plot_ly(data = nba_df, x = ~Height, y = ~`3pm`)
fig <- fig %>%layout(title = 'A Scatter Plot of 3pt Percentage against Height')
fig
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fig.2 <- plot_ly(data = nba_data, x = ~`3pm`, y = ~`3pa`,
title = "S")
fig.2 <- fig.2 %>%layout(title = 'A Scatter Plot of 3pt Percentage against 3pt Made')
fig.2
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: 'scatter' objects don't have these attributes: 'title'
## Valid attributes include:
## 'cliponaxis', 'connectgaps', 'customdata', 'customdatasrc', 'dx', 'dy', 'error_x', 'error_y', 'fill', 'fillcolor', 'fillpattern', 'groupnorm', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hoveron', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'orientation', 'selected', 'selectedpoints', 'showlegend', 'stackgaps', 'stackgroup', 'stream', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'unselected', 'visible', 'x', 'x0', 'xaxis', 'xcalendar', 'xhoverformat', 'xperiod', 'xperiod0', 'xperiodalignment', 'xsrc', 'y', 'y0', 'yaxis', 'ycalendar', 'yhoverformat', 'yperiod', 'yperiod0', 'yperiodalignment', 'ysrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
fig.3 <- plot_ly(data = nba_data, x = ~eff, y = ~`3pm`)
fig.3 <- fig %>%layout(title = 'A Scatter Plot of 3pt Percentage against Efficiency')
fig.3
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Figure 4 Scatterplot (3pm vs Height)
We cannot discern any relationship between 3pm and height, as the points appear random, with no visual pattern to indicate association between player’s height and 3-points scores made.
fg <- plot_ly(data = nba_df, x = ~Age, y = ~`3pm`)
fg %>%layout(title = 'A Scatter Plot of 3pt Percentage against Player Age')
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fg
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Figure 3 Scatter plot (3pm vs Age)
The scatterplot above visualizes the relationship between player score performance (3p scores per game) by age. There appears to be a positive relationship between age and 3-points scoring. A player is at prime in mid 20s and early 30s, and start to decline as he approaches 35 years.
#Full Model
set.seed(123)
#define 3pm as y
nba_df$y = nba_df$`3pm`
#remove 3pm
nba_df = subset(nba_df, select = -(`3pm`))
#use 70% of dataset as training set and 30% as test set
sample <- sample.split(nba_df$y, SplitRatio = 0.8)
train <- subset(nba_df, sample == TRUE)
test <- subset(nba_df, sample == FALSE)
rec <- recipe(
y ~.,
data = train)%>%
step_normalize(Salary, Age, gp, min, Height, Weight)%>%
step_corr(all_numeric_predictors(), threshold = 0.85)%>%
step_dummy(Shoes)
log.rec.prep <- prep(rec, training = train) #prepare recipe
train_set <- log.rec.prep %>% #replicate data transformation results to testing data
bake(new_data = NULL)
full.model = lm(y~., data = train_set)
summary(full.model)
##
## Call:
## lm(formula = y ~ ., data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.34508 -0.31909 0.00493 0.21422 1.34508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.86389 0.38559 4.834 1.6e-05 ***
## Age 0.25070 0.10389 2.413 0.01995 *
## gp 0.08763 0.07585 1.155 0.25409
## min 0.30377 0.13823 2.198 0.03316 *
## ast -0.13182 0.06877 -1.917 0.06164 .
## oreb -0.15377 0.21977 -0.700 0.48771
## dreb -0.05807 0.09053 -0.641 0.52450
## fta 0.18309 0.06245 2.932 0.00528 **
## stl 0.48918 0.25752 1.900 0.06391 .
## blk -0.02152 0.25765 -0.084 0.93380
## Salary -0.10557 0.13114 -0.805 0.42504
## Height -0.15203 0.13879 -1.095 0.27917
## Weight 0.05863 0.12728 0.461 0.64727
## Shoes_Anta 0.60839 0.61274 0.993 0.32606
## Shoes_Jordan -0.17371 0.28513 -0.609 0.54542
## Shoes_K8IROS -0.34275 0.61660 -0.556 0.58106
## Shoes_New.Balance -1.10735 0.66754 -1.659 0.10410
## Shoes_Nike 0.04406 0.19222 0.229 0.81974
## Shoes_Puma 0.01156 0.45995 0.025 0.98005
## Shoes_Q4 -0.08735 0.62089 -0.141 0.88875
## Shoes_Under.Armour 1.29128 0.46922 2.752 0.00851 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5519 on 45 degrees of freedom
## Multiple R-squared: 0.5765, Adjusted R-squared: 0.3883
## F-statistic: 3.063 on 20 and 45 DF, p-value: 0.0009186
#Model Selection
Optimal Model: Based on the Lowest AIC Value
Figure 6 Figure 6 shows the final model containing best subsets of variables for predicting offensive performance of an NBA player, measured by 3-point scores made (3pm).The selection of the variables to include in final model was based on the lowest AIC value. We chose AIC as our ideal model selection criterion because it results in the selection of the model that bests fits the data and it is moderately tolerant compared to BIC (Yang & Berdine, 2015). The selected variables include age, games played (gp), minutes played (min), assist (ast), offensive rebounds (oreb), steals (stl), free throws (fta), and shoes. The coefficient on age, min, fta, stl, shoes (new balance and under armour) are statistically significant at 0.01 because their corresponding p- values is less than 0.01 (see figure 6). To illustrate a few variables, the coefficient on age is 0.1976, implying that an increase in player age by one year is associated with an increase in 3pt scores by 0.2, presuming other factors are held constant. Furthermore, an NBA player wearing under armour during play is likely to make more 3pt scores compared to a player wearing other brands, such as Under Armour, or Adidas. The resulting equation for predicting player performance is: Y i = 1.9627 + 0.1976X 1 + 0.091X 2 + 0.294X 3 - 0.165X 4 – 0.254X 5 – 0.152X 6 + 0.474X 7 – 0.148X 8 -1.10X 9 + 1.139X 10 , where X 1 denotes age, X 2 is gp, X 3 is min, X 4 is ast, X 5 is oreb, X 6 is fta, X 7 is stl, X 8 is height, X 9 is Shoes_New.Balance and X 10 is Shoes.Under.Armour. The global F test informs us to reject the null hypothesis all coefficients are equal to zero because there is sufficient evidence to indicate that at least one covariate is statistically different from zero, F (10, 55) = 6.503, p-value = 1.566 e-06 <0.05. Lastly, the adjusted R-squared is 0.4585, implying that this final model explains at least 45.9% of the variation in 3-P scores across NBA players.
Predictive Performance on Testing Set
Figure 7 is a scatter plot of actual vs fitted values of 3-points scores made. It is evident that most values lie close to the regression line and are within the 95% confidence level region. However, some of the values we either under-predicted or over-predicted, as evidenced by the points lying far away from the regression line and outside the grey region. The calculated RMSE is 0.4868, which is relatively close to 0, hence, the model did great in terms of predicting player performance.
#ols_step_best_subset(full.model)
#final model
set.seed(123)
final.model <- stepAIC(full.model, direction ='backward', trace = FALSE)
reg.summary = summary(final.model)
reg.summary
##
## Call:
## lm(formula = y ~ Age + gp + min + ast + oreb + fta + stl + Height +
## Shoes_New.Balance + Shoes_Under.Armour, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2561 -0.2950 -0.0173 0.2812 1.2561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.96274 0.25620 7.661 3.12e-10 ***
## Age 0.19760 0.07149 2.764 0.00775 **
## gp 0.09051 0.06932 1.306 0.19709
## min 0.29394 0.11231 2.617 0.01142 *
## ast -0.16537 0.05823 -2.840 0.00632 **
## oreb -0.25388 0.16733 -1.517 0.13494
## fta 0.15153 0.05017 3.020 0.00382 **
## stl 0.47401 0.23129 2.049 0.04520 *
## Height -0.14752 0.09039 -1.632 0.10839
## Shoes_New.Balance -1.10227 0.61421 -1.795 0.07821 .
## Shoes_Under.Armour 1.13869 0.39420 2.889 0.00552 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5193 on 55 degrees of freedom
## Multiple R-squared: 0.5418, Adjusted R-squared: 0.4585
## F-statistic: 6.503 on 10 and 55 DF, p-value: 1.566e-06
#make predictions on testing dataset using the full model
testing_set <- bake(log.rec.prep, test)
testing_set$fitted = predict(final.model, newdata = testing_set)
ggplot(testing_set, aes(x = y, y=fitted))+
geom_point()+
geom_smooth(method='lm')+
ggtitle('fitted vs actual plot')+
ylab('Actual')+
xlab('Fitted')+
theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula 'y ~ x'
Figure 7
#RMSE
sqrt(mean((testing_set$y - testing_set$fitted)^2))#print RMSE
## [1] 0.4868246
#relative importance of predictors
rel_imp <- calc.relimp(final.model, type = c("lmg"), rela = TRUE)
rel_imp
## Response variable: y
## Total response variance: 0.4979138
## Analysis based on 66 observations
##
## 10 Regressors:
## Age gp min ast oreb fta stl Height Shoes_New.Balance Shoes_Under.Armour
## Proportion of variance explained by model: 54.18%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Age 0.13630060
## gp 0.03241851
## min 0.17495645
## ast 0.07470439
## oreb 0.04049944
## fta 0.19755163
## stl 0.12670609
## Height 0.02971409
## Shoes_New.Balance 0.02857386
## Shoes_Under.Armour 0.15857495
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## Age 0.18843249 0.19682366 0.20119811 0.202749585 0.20255063
## gp 0.07341793 0.08303488 0.09118498 0.097550228 0.10198134
## min 0.28117460 0.27301450 0.26944267 0.269365898 0.27186635
## ast 0.08358611 0.05570702 0.02677829 -0.002545732 -0.03183065
## oreb -0.03286856 -0.11077718 -0.17132014 -0.216486952 -0.24813737
## fta 0.13771387 0.13554094 0.13422713 0.133845041 0.13449459
## stl 0.65297450 0.60607408 0.56231969 0.525101858 0.49662065
## Height -0.07589576 -0.06823530 -0.06386090 -0.063807021 -0.06848615
## Shoes_New.Balance -0.25692308 -0.39081974 -0.49618622 -0.581271368 -0.65560592
## Shoes_Under.Armour 1.18281250 1.21881034 1.24382312 1.259549900 1.26638118
## 6Xs 7Xs 8Xs 9Xs 10Xs
## Age 0.20149636 0.20021763 0.1990540 0.19813601 0.19760080
## gp 0.10436893 0.10455909 0.1023667 0.09767239 0.09051081
## min 0.27609837 0.28122254 0.2863954 0.29083274 0.29393900
## ast -0.06074542 -0.08896621 -0.1161155 -0.14174425 -0.16536503
## oreb -0.26791949 -0.27727600 -0.2774498 -0.26944217 -0.25387682
## fta 0.13622786 0.13900043 0.1426556 0.14693799 0.15152709
## stl 0.47772894 0.46797791 0.4658055 0.46880077 0.47401157
## Height -0.07781629 -0.09129638 -0.1080872 -0.12715290 -0.14751522
## Shoes_New.Balance -0.72835035 -0.80682158 -0.8953217 -0.99458905 -1.10226842
## Shoes_Under.Armour 1.26370269 1.25037916 1.2253556 1.18812428 1.13869117
Relative Importance Based on the output below, free throws made (fta) and minutes played (min) are ranked as the important predictor of 3-P scores, followed closely by Shoes (Under Armour), then age and steals (stl). Scouts should pay more attention to these metrics when scouting for NBA players to play in the professional league as it will guarantee positive returns on their investments.
you will
NOTE: Your Data Analysis can be broken up into multiple pages if that helps with your organization.